In [3]:
# !pip install powerbiclient
# #pip install ipywidgets
In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
In [5]:
data = pd.read_csv(r"C:\Users\Sunny\Downloads\shopping_trends.csv")
In [6]:
#data.to_csv("path"    ) // to convert dataframe in to file
In [7]:
data
Out[7]:
Customer ID Age Gender Item Purchased Category Purchase Amount (USD) Location Size Color Season Review Rating Subscription Status Payment Method Shipping Type Discount Applied Promo Code Used Previous Purchases Preferred Payment Method Frequency of Purchases
0 1 55 Male Blouse Clothing 53 Kentucky L Gray Winter 3.1 Yes Credit Card Express Yes Yes 14 Venmo Fortnightly
1 2 19 Male Sweater Clothing 64 Maine L Maroon Winter 3.1 Yes Bank Transfer Express Yes Yes 2 Cash Fortnightly
2 3 50 Male Jeans Clothing 73 Massachusetts S Maroon Spring 3.1 Yes Cash Free Shipping Yes Yes 23 Credit Card Weekly
3 4 21 Male Sandals Footwear 90 Rhode Island M Maroon Spring 3.5 Yes PayPal Next Day Air Yes Yes 49 PayPal Weekly
4 5 45 Male Blouse Clothing 49 Oregon M Turquoise Spring 2.7 Yes Cash Free Shipping Yes Yes 31 PayPal Annually
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3895 3896 40 Female Hoodie Clothing 28 Virginia L Turquoise Summer 4.2 No Cash 2-Day Shipping No No 32 Venmo Weekly
3896 3897 52 Female Backpack Accessories 49 Iowa L White Spring 4.5 No PayPal Store Pickup No No 41 Bank Transfer Bi-Weekly
3897 3898 46 Female Belt Accessories 33 New Jersey L Green Spring 2.9 No Credit Card Standard No No 24 Venmo Quarterly
3898 3899 44 Female Shoes Footwear 77 Minnesota S Brown Summer 3.8 No PayPal Express No No 24 Venmo Weekly
3899 3900 52 Female Handbag Accessories 81 California M Beige Spring 3.1 No Bank Transfer Store Pickup No No 33 Venmo Quarterly

3900 rows × 19 columns

In [8]:
data.sample(4) # randomely fetch the information
Out[8]:
Customer ID Age Gender Item Purchased Category Purchase Amount (USD) Location Size Color Season Review Rating Subscription Status Payment Method Shipping Type Discount Applied Promo Code Used Previous Purchases Preferred Payment Method Frequency of Purchases
2276 2277 31 Male Jeans Clothing 82 Wyoming L Red Summer 4.1 No Bank Transfer 2-Day Shipping No No 25 Debit Card Weekly
2368 2369 29 Male Sunglasses Accessories 38 New Jersey M Lavender Summer 3.3 No Credit Card Standard No No 6 Debit Card Every 3 Months
2684 2685 46 Female Dress Clothing 45 Washington M Gray Summer 4.7 No Venmo Free Shipping No No 42 Credit Card Bi-Weekly
2851 2852 58 Female Blouse Clothing 73 Nevada L Teal Summer 3.0 No Bank Transfer Store Pickup No No 42 Debit Card Monthly
In [9]:
data.describe()
Out[9]:
Customer ID Age Purchase Amount (USD) Review Rating Previous Purchases
count 3900.000000 3900.000000 3900.000000 3900.000000 3900.000000
mean 1950.500000 44.068462 59.764359 3.749949 25.351538
std 1125.977353 15.207589 23.685392 0.716223 14.447125
min 1.000000 18.000000 20.000000 2.500000 1.000000
25% 975.750000 31.000000 39.000000 3.100000 13.000000
50% 1950.500000 44.000000 60.000000 3.700000 25.000000
75% 2925.250000 57.000000 81.000000 4.400000 38.000000
max 3900.000000 70.000000 100.000000 5.000000 50.000000
In [10]:
data.describe(include="object")
Out[10]:
Gender Item Purchased Category Location Size Color Season Subscription Status Payment Method Shipping Type Discount Applied Promo Code Used Preferred Payment Method Frequency of Purchases
count 3900 3900 3900 3900 3900 3900 3900 3900 3900 3900 3900 3900 3900 3900
unique 2 25 4 50 4 25 4 2 6 6 2 2 6 7
top Male Blouse Clothing Montana M Olive Spring No Credit Card Free Shipping No No PayPal Every 3 Months
freq 2652 171 1737 96 1755 177 999 2847 696 675 2223 2223 677 584
In [11]:
data.shape
Out[11]:
(3900, 19)
In [12]:
shop_dup = data[data.duplicated()]
shop_dup
Out[12]:
Customer ID Age Gender Item Purchased Category Purchase Amount (USD) Location Size Color Season Review Rating Subscription Status Payment Method Shipping Type Discount Applied Promo Code Used Previous Purchases Preferred Payment Method Frequency of Purchases
In [13]:
data.dtypes
Out[13]:
Customer ID                   int64
Age                           int64
Gender                       object
Item Purchased               object
Category                     object
Purchase Amount (USD)         int64
Location                     object
Size                         object
Color                        object
Season                       object
Review Rating               float64
Subscription Status          object
Payment Method               object
Shipping Type                object
Discount Applied             object
Promo Code Used              object
Previous Purchases            int64
Preferred Payment Method     object
Frequency of Purchases       object
dtype: object
In [14]:
data.columns
Out[14]:
Index(['Customer ID', 'Age', 'Gender', 'Item Purchased', 'Category',
       'Purchase Amount (USD)', 'Location', 'Size', 'Color', 'Season',
       'Review Rating', 'Subscription Status', 'Payment Method',
       'Shipping Type', 'Discount Applied', 'Promo Code Used',
       'Previous Purchases', 'Preferred Payment Method',
       'Frequency of Purchases'],
      dtype='object')
In [15]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Customer ID               3900 non-null   int64  
 1   Age                       3900 non-null   int64  
 2   Gender                    3900 non-null   object 
 3   Item Purchased            3900 non-null   object 
 4   Category                  3900 non-null   object 
 5   Purchase Amount (USD)     3900 non-null   int64  
 6   Location                  3900 non-null   object 
 7   Size                      3900 non-null   object 
 8   Color                     3900 non-null   object 
 9   Season                    3900 non-null   object 
 10  Review Rating             3900 non-null   float64
 11  Subscription Status       3900 non-null   object 
 12  Payment Method            3900 non-null   object 
 13  Shipping Type             3900 non-null   object 
 14  Discount Applied          3900 non-null   object 
 15  Promo Code Used           3900 non-null   object 
 16  Previous Purchases        3900 non-null   int64  
 17  Preferred Payment Method  3900 non-null   object 
 18  Frequency of Purchases    3900 non-null   object 
dtypes: float64(1), int64(4), object(14)
memory usage: 579.0+ KB
In [16]:
data.isnull().sum()
Out[16]:
Customer ID                 0
Age                         0
Gender                      0
Item Purchased              0
Category                    0
Purchase Amount (USD)       0
Location                    0
Size                        0
Color                       0
Season                      0
Review Rating               0
Subscription Status         0
Payment Method              0
Shipping Type               0
Discount Applied            0
Promo Code Used             0
Previous Purchases          0
Preferred Payment Method    0
Frequency of Purchases      0
dtype: int64
In [17]:
print(f"The unique values of the 'Gender' column are :{data['Gender'] .unique()}")
The unique values of the 'Gender' column are :['Male' 'Female']
In [18]:
print(f"The unique values of the 'Item Purchased' column are :{data['Item Purchased'] .unique()}")
The unique values of the 'Item Purchased' column are :['Blouse' 'Sweater' 'Jeans' 'Sandals' 'Sneakers' 'Shirt' 'Shorts' 'Coat'
 'Handbag' 'Shoes' 'Dress' 'Skirt' 'Sunglasses' 'Pants' 'Jacket' 'Hoodie'
 'Jewelry' 'T-shirt' 'Scarf' 'Hat' 'Socks' 'Backpack' 'Belt' 'Boots'
 'Gloves']
In [19]:
print(f"The unique values of the 'Location' column are :{data['Location'] .unique()}")
print()
The unique values of the 'Location' column are :['Kentucky' 'Maine' 'Massachusetts' 'Rhode Island' 'Oregon' 'Wyoming'
 'Montana' 'Louisiana' 'West Virginia' 'Missouri' 'Arkansas' 'Hawaii'
 'Delaware' 'New Hampshire' 'New York' 'Alabama' 'Mississippi'
 'North Carolina' 'California' 'Oklahoma' 'Florida' 'Texas' 'Nevada'
 'Kansas' 'Colorado' 'North Dakota' 'Illinois' 'Indiana' 'Arizona'
 'Alaska' 'Tennessee' 'Ohio' 'New Jersey' 'Maryland' 'Vermont'
 'New Mexico' 'South Carolina' 'Idaho' 'Pennsylvania' 'Connecticut' 'Utah'
 'Virginia' 'Georgia' 'Nebraska' 'Iowa' 'South Dakota' 'Minnesota'
 'Washington' 'Wisconsin' 'Michigan']

In [20]:
print(f"The unique values of the 'Category' column are :{data['Category'] .unique()}")
print()
print(f"The unique values of the 'Size ' column are :{data['Size'] .unique()}")
print()
print(f"The unique values of the 'Subscription Status ' column are :{data['Subscription Status'] .unique()}")
print()
print(f"The unique values of the 'Payment Method' column are :{data['Payment Method'] .unique()}")
print()
print(f"The unique values of the 'Shipping Type' column are :{data['Shipping Type'] .unique()}")
print()
print(f"The unique values of the 'Discount Applied  ' column are :{data['Discount Applied'] .unique()}")
print()
print(f"The unique values of the 'Promo Code Used  ' column are :{data['Promo Code Used'] .unique()}")
print()
print(f"The unique values of the 'Preferred Payment Method  ' column are :{data['Preferred Payment Method'] .unique()}")
The unique values of the 'Category' column are :['Clothing' 'Footwear' 'Outerwear' 'Accessories']

The unique values of the 'Size ' column are :['L' 'S' 'M' 'XL']

The unique values of the 'Subscription Status ' column are :['Yes' 'No']

The unique values of the 'Payment Method' column are :['Credit Card' 'Bank Transfer' 'Cash' 'PayPal' 'Venmo' 'Debit Card']

The unique values of the 'Shipping Type' column are :['Express' 'Free Shipping' 'Next Day Air' 'Standard' '2-Day Shipping'
 'Store Pickup']

The unique values of the 'Discount Applied  ' column are :['Yes' 'No']

The unique values of the 'Promo Code Used  ' column are :['Yes' 'No']

The unique values of the 'Preferred Payment Method  ' column are :['Venmo' 'Cash' 'Credit Card' 'PayPal' 'Bank Transfer' 'Debit Card']
In [21]:
data['Age'].value_counts() # how many counts based on age 
Out[21]:
Age
69    88
57    87
41    86
25    85
49    84
50    83
54    83
27    83
62    83
32    82
19    81
58    81
42    80
43    79
28    79
31    79
37    77
46    76
29    76
68    75
59    75
63    75
56    74
36    74
55    73
52    73
64    73
35    72
51    72
65    72
40    72
45    72
47    71
66    71
30    71
23    71
38    70
53    70
18    69
21    69
26    69
34    68
48    68
24    68
39    68
70    67
22    66
61    65
60    65
33    63
20    62
67    54
44    51
Name: count, dtype: int64
In [22]:
data['Age'].mean()
Out[22]:
44.06846153846154
In [ ]:
 

Descriptive Analysis¶

In [23]:
# analysis past data to understand trends and patterns.
In [24]:
shop =data['Gender'].value_counts()
shop.plot(kind = 'pie', explode=(0,0.1),autopct = '%1.1f%%')
plt.xlabel("Gender")
Out[24]:
Text(0.5, 0, 'Gender')
No description has been provided for this image
In [25]:
#data['Gender'].value_counts().plot(kind ='bar'
In [26]:
data['Age_category'] = pd.cut(data['Age'],bins=[0,15,18,30,50,70],labels=['child' , 'teen',
                                                                       'Multi-aged adults',
                                                                          'young Adult','old'])
In [27]:
fig = px.histogram(data , y='Age' , x ='Age_category')
fig.show()

How does the average purchase amount vary accross different product ?¶

In [28]:
data['Category'].unique()
Out[28]:
array(['Clothing', 'Footwear', 'Outerwear', 'Accessories'], dtype=object)
In [29]:
data.groupby('Category')['Purchase Amount (USD)'].mean()
Out[29]:
Category
Accessories    59.838710
Clothing       60.025331
Footwear       60.255426
Outerwear      57.172840
Name: Purchase Amount (USD), dtype: float64
In [30]:
plt.figure(figsize=(20,6))
data1 = data['Category'].value_counts()
explode =[0.1]*len(data1)
data1.plot(kind ='pie',explode =explode,autopct= '%1.1f%%')
plt.xlabel('Category')
plt.legend()
plt.show()
No description has been provided for this image

Which Gender has the highest number of purchases ??¶

In [31]:
sns.barplot(data , x='Gender' , y= 'Purchase Amount (USD)')
Out[31]:
<Axes: xlabel='Gender', ylabel='Purchase Amount (USD)'>
No description has been provided for this image

Seasons or months where customer spending is significantly higher¶

In [32]:
# seasons sales
data2 = data['Season'].value_counts()
data2
Out[32]:
Season
Spring    999
Fall      975
Winter    971
Summer    955
Name: count, dtype: int64
In [33]:
plt.figure(figsize=(20,6))
data1 = data['Season'].value_counts()
explode =[0.1]*len(data1)
data1.plot(kind ='pie',explode =explode,autopct= '%1.1f%%')
plt.xlabel('Category')
plt.legend()
plt.show()
No description has been provided for this image

Payment MEthod is the most used by customers¶

In [34]:
data.groupby('Payment Method')['Purchase Amount (USD)'].mean().sort_values(ascending=False)
Out[34]:
Payment Method
Venmo            61.241960
Credit Card      61.159483
Cash             59.927469
Bank Transfer    58.738924
PayPal           58.697492
Debit Card       58.638231
Name: Purchase Amount (USD), dtype: float64
In [35]:
sns.barplot(x="Payment Method",y='Purchase Amount (USD)',data =data)
Out[35]:
<Axes: xlabel='Payment Method', ylabel='Purchase Amount (USD)'>
No description has been provided for this image

What are the most commonly purchased items in each category ??¶

In [36]:
data.groupby('Category')['Item Purchased'].value_counts()
Out[36]:
Category     Item Purchased
Accessories  Jewelry           171
             Belt              161
             Sunglasses        161
             Scarf             157
             Hat               154
             Handbag           153
             Backpack          143
             Gloves            140
Clothing     Blouse            171
             Pants             171
             Shirt             169
             Dress             166
             Sweater           164
             Socks             159
             Skirt             158
             Shorts            157
             Hoodie            151
             T-shirt           147
             Jeans             124
Footwear     Sandals           160
             Shoes             150
             Sneakers          145
             Boots             144
Outerwear    Jacket            163
             Coat              161
Name: count, dtype: int64
In [37]:
fig = px.histogram(data, x= 'Item Purchased', color= 'Category')
fig.show()
In [38]:
# Extract Correlation Features
In [39]:
num_fea = data[['Age','Purchase Amount (USD)','Review Rating','Previous Purchases']]
num_fea.head()
Out[39]:
Age Purchase Amount (USD) Review Rating Previous Purchases
0 55 53 3.1 14
1 19 64 3.1 2
2 50 73 3.1 23
3 21 90 3.5 49
4 45 49 2.7 31
In [40]:
 corr = num_fea.corr()
corr
Out[40]:
Age Purchase Amount (USD) Review Rating Previous Purchases
Age 1.000000 -0.010424 -0.021949 0.040445
Purchase Amount (USD) -0.010424 1.000000 0.030776 0.008063
Review Rating -0.021949 0.030776 1.000000 0.004229
Previous Purchases 0.040445 0.008063 0.004229 1.000000
In [41]:
#plt.figure(figsize(5,3))
sns.heatmap(corr,annot=True,cmap ='BuPu')
Out[41]:
<Axes: >
No description has been provided for this image
In [42]:
 
In [43]:
 
In [ ]:
 
In [45]:
 
In [ ]:
 
In [48]:
 
In [49]:
 
In [52]:
 
In [ ]: